# ---- Clean the T4 cortisol data and run exploratory analyses ----
data <- read.csv("/home/bambrozi/2_CORTISOL/Data/T4_Elora_Data_04_25_2024.csv")

# Replace every spelling of "treated" with NA so the column can become numeric.
# %in% (unlike ==) never yields NA in the index, so this is safe even if the
# column already contains NA cells.
treated_labels <- c("treated", "Treated at T2", "treated at T2")
data$T4Cortisol[data$T4Cortisol %in% treated_labels] <- NA

# Convert the column to numeric, coercing any remaining non-numeric values to NA
data$T4Cortisol <- as.numeric(as.character(data$T4Cortisol))

# Keep only the rows with a cortisol measurement
data <- data[!is.na(data$T4Cortisol), ]

# Save the cleaned data set
write.csv(data, "/home/bambrozi/2_CORTISOL/Data/data_clean.csv", row.names = FALSE)
print(data)

# Summary statistics
summary(data$T4Cortisol)

# Histogram
hist(data$T4Cortisol, breaks = 20, main = "Histogram of T4 Cortisol", xlab = "T4 Cortisol")

# Boxplot
boxplot(data$T4Cortisol, main = "Boxplot of T4 Cortisol", ylab = "T4 Cortisol")

# Density plot
plot(density(data$T4Cortisol), main = "Density Plot of T4 Cortisol",
     xlab = "T4 Cortisol", ylab = "Density")

# QQ plot. FIX: compute the theoretical quantiles ONCE with plot.it = FALSE;
# the original called qqnorm() five times, redrawing the plot on every call
# just to read off the axis limits.
qq <- qqnorm(data$T4Cortisol, plot.it = FALSE)
qqnorm(data$T4Cortisol, main = "QQ Plot of T4Cortisol",
       xlim = range(qq$x),
       ylim = c(min(qq$y), max(qq$y) + 2 * IQR(qq$y)))
# Add the QQ reference line
qqline(data$T4Cortisol, col = "red")
Summary statistics
Histogram
Density
Box_Plot
qq_Plot
Shapiro-Wilk normality test
I received an e-mail from Umesh describing the three categories into which the animals could be sorted based on their cortisol concentration.
# Sort each animal into a cortisol category (cutoffs provided by Umesh):
#   H = high   (T4Cortisol >= 956)
#   L = low    (T4Cortisol <= 190.8)
#   M = medium (everything in between)
data$Categorical <- "M"
data$Categorical[data$T4Cortisol >= 956] <- "H"
data$Categorical[data$T4Cortisol <= 190.8] <- "L"
table(data$Categorical)
library(ggplot2)
# Order the factor levels L < M < H so plots show the categories in that order
data$Categorical <- factor(data$Categorical, levels = c("L", "M", "H"))
# Create the histogram with reordered categories
# Bar chart: number of animals in each cortisol category (bar order follows
# the L/M/H factor levels set above)
ggplot(data, aes(x = Categorical, fill = Categorical)) +
geom_bar() +
labs(title = "Histogram of T4Cortisol by Category",
x = "Category",
y = "Frequency") +
theme_minimal()
# Histogram of the raw cortisol values, colored by category
ggplot(data, aes(x = T4Cortisol, fill = Categorical)) +
geom_histogram(binwidth = 50, color = "black", alpha = 0.7) + # Adjust binwidth as needed
labs(title = "Histogram of T4Cortisol with Color by Category",
x = "T4 Cortisol",
y = "Frequency",
fill = "Category") +
scale_fill_manual(values = c("H" = "red", "M" = "blue", "L" = "green")) + # Adjust colors if needed
theme_minimal()
# Per-category density of cortisol (alpha keeps overlapping densities visible)
ggplot(data, aes(x = T4Cortisol, fill = Categorical)) +
geom_density(alpha = 0.3) +
labs(title = "Density Plot of T4Cortisol with Color by Category",
x = "T4Cortisol",
y = "Density",
fill = "Category") +
scale_fill_manual(values = c("H" = "red", "M" = "blue", "L" = "green")) + # Adjust colors if needed
theme_minimal()
# Overall density with dashed red lines at the category cutoffs (190.8 and 956)
ggplot(data, aes(x = T4Cortisol)) +
geom_density() +
geom_vline(xintercept = c(956, 190.8), linetype = "dashed", color = "red") +
labs(title = "Density Plot of T4Cortisol with Vertical Lines",
x = "T4Cortisol",
y = "Density") +
theme_minimal()
The animals were sorted into these three categories: H = High, M = Medium, L = Low.
The individuals' frequency distribution across these categories is shown in the plots below.
Observing the previous plots, I tried removing the “outlier” phenotypes above 1250, but the Shapiro test still indicates that the data are not normally distributed.
library(tidyverse)

# Remove the extreme phenotypes (cortisol >= 1250) flagged as outliers
data_no_out <- filter(data, T4Cortisol < 1250)

# QQ plot of the outlier-free data
qqnorm(data_no_out$T4Cortisol, main = "QQ Plot of T4Cortisol",
       xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
# BUG FIX: the reference line must come from the same (filtered) vector that
# was plotted; the original drew it from the unfiltered `data`
qqline(data_no_out$T4Cortisol, col = "red")

boxplot(data_no_out$T4Cortisol, main = "Boxplot of T4 Cortisol", ylab = "T4 Cortisol")
hist(data_no_out$T4Cortisol, breaks = 20, main = "Histogram of T4 Cortisol", xlab = "T4 Cortisol")

# BUG FIX: test normality of the FILTERED data. The original ran the test on
# the full `data`, so removing the outliers could never change the Shapiro
# result reported in the text above.
shapiro.test(data_no_out$T4Cortisol)
To assess the correlation between Cortisol phenotypes and Genomic Estimated Breeding Values (GEBVs), we opt for a linear regression instead of a standard correlation test. This decision is driven by the non-normal distribution of our Cortisol phenotypes, which violates the assumptions required for traditional correlation tests.
Linear regression offers a robust alternative as it does not necessitate normality for the dependent variable. By regressing GEBVs over Cortisol, we can model the relationship between these variables. Our aim is to estimate the regression coefficient, which serves as our correlation estimate.
Due to the violation of normality assumptions for the dependent variable (Cortisol), traditional correlation tests may not provide reliable results, particularly in assessing the significance of the correlation. Therefore, alternative approaches, such as linear regression, are preferred as they do not require the same assumptions about the distribution of the dependent variable. By using linear regression, we can still assess the relationship between Cortisol and GEBVs while accommodating the non-normality of Cortisol phenotypes.
The regression model can be represented as follows: \[ y = \beta_0 + \beta_1 \times GEBV_{\text{Milk}} + \epsilon \]
Where:
This approach enables us to quantify the relationship between Cortisol and GEBVs, addressing the non-normality of Cortisol phenotypes while allowing for formal hypothesis testing of the correlation’s significance.
The first data set I received from Lucas had values for only 135 of the 260 animals; the other 125 had only NA. I showed this to Lucas; Lucas wrote to Alisson, then sent me the missing animals, and I merged the two files.
# ---- Merge the cortisol phenotypes with the GEBV files ----
# NOTE(review): rm(list = ls()) in a script is discouraged (it wipes the
# caller's workspace); kept here to preserve the original workflow.
rm(list = ls())

# Load the necessary libraries
library(dplyr)
library(tidyverse)

cortisol_260 <- read.csv("/home/bambrozi/2_CORTISOL/Data/data_clean.csv")
# First GEBV file: values for 135 animals, NA for the other 125
GEBVs1 <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/ebvs_elora.csv")
# Second GEBV file: values for the 125 animals missing from the first file
GEBVs2 <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/elora_missing_females_2404_06_11_2024.csv")
# These are the columns we can use because we know the meaning of the acronyms
GEBVs_to_use <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/ebv_names_lucas_06102024_BAG.csv")

# Diagnostics: how many MILK GEBVs are missing / present in each file
sum(is.na(GEBVs1$MILK))
# Keep only rows with a barn name. FIX: direct logical filter replaces the
# `which(is.na(x) == F)` anti-pattern (comparison to the reassignable F).
GEBVs1 <- GEBVs1[!is.na(GEBVs1$DHI_BARN_NAME), ]
sum(!is.na(GEBVs2$MILK))
GEBVs2 <- GEBVs2[!is.na(GEBVs2$DHI_BARN_NAME), ]
print(GEBVs1$DHI_BARN_NAME)
print(GEBVs2$DHI_BARN_NAME)

# Make the two dataframes comparable: drop the id columns unique to each file
GEBVs1 <- GEBVs1 %>% select(-elora_id, -international_id)
GEBVs2 <- GEBVs2 %>% select(-ANIMAL_ID)

# Check the two dataframes share the same column set. FIX: setequal() is
# length-safe, unlike all(names(a) == names(b)), which recycles and warns when
# the column counts differ.
have_same_columns <- setequal(names(GEBVs1), names(GEBVs2))
if (have_same_columns) {
  print("The dataframes have the same columns.")
} else {
  print("The dataframes do not have the same columns.")
}
# Check the column names are also in the same order (rbind below needs this)
same_order <- identical(names(GEBVs1), names(GEBVs2))
if (same_order) {
  print("The columns are in the same order.")
} else {
  print("The columns are not in the same order.")
}

GEBVs_combined <- rbind(GEBVs1, GEBVs2)

# Verify the two data sets describe the same animals (order-independent check)
sorted_cortisol_260 <- sort(cortisol_260$ID)
sorted_GEBVs_combined <- sort(GEBVs_combined$DHI_BARN_NAME)
identical(sorted_cortisol_260, sorted_GEBVs_combined)

# Duplicate DHI_BARN_NAME as elora_id (the merge key used below) and move the
# id columns to the front
GEBVs_combined$elora_id <- GEBVs_combined$DHI_BARN_NAME
GEBVs_combined <- GEBVs_combined %>%
  select(elora_id, DHI_BARN_NAME, everything())
write.csv(GEBVs_combined, "/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/ebvs_elora_complete.csv")

# Merge the cortisol values with the GEBV values
Merg_Cort_GEBVs <- merge(cortisol_260, GEBVs_combined, by.x = "ID", by.y = "elora_id")
write.csv(Merg_Cort_GEBVs, "/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/Merged_Cortisol_GEBVs.csv")

# Open the file listing the GEBV columns to keep
Columns_to_use <- readLines("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/traits_to_use.txt")
# FRAGILE(review): renames column 405 by position; if the merged file ever
# gains or loses a column this renames the wrong trait -- rename by name instead.
colnames(Merg_Cort_GEBVs)[405] <- "IDD"
data <- select(Merg_Cort_GEBVs, ID, T4Cortisol, BIRTH_YEAR, all_of(Columns_to_use))
# data now holds the 55 GEBVs + cortisol + birth year
write.csv(data, "/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/data_GEBVs_Cortisol_select_traits.csv")
# BUG FIX: the file was read into `samp_date2` but every following line used
# `samp_date`, so this section failed with "object 'samp_date' not found".
# Read it under the name actually used.
samp_date <- read.csv("/home/bambrozi/2_CORTISOL/Data/Elora animal_ids_kl_sampling_date.csv")
# Convert Sampling_date from character to Date
samp_date$Sampling_date <- as.Date(samp_date$Sampling_date, format = "%m/%d/%Y")
table(samp_date$Sampling_date)
samp_date <- select(samp_date, Elora_id, Sampling_date)
# Check if data$ID and samp_date$Elora_id are identical in values and order
identical(data$ID, samp_date$Elora_id)
data_final <- merge(data, samp_date, by.x = "ID", by.y = "Elora_id")
data_final <- data_final %>%
  select(ID, T4Cortisol, BIRTH_YEAR, Sampling_date, everything())
# data_final holds the 55 GEBVs + cortisol + birth year + sampling date
write.csv(data_final, "/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/data_GEBVs_Cortisol_select_traits2.csv")
PS: I double-checked the select and merge process by hand against the original tables received, and everything is OK.
We tested the minimum model, adding only Birth Year, but adding Birth Year and Sampling Date we got the best results.
The regression model added the BY and SAMPLING DATE is shown bellow:
\[ y = \beta_0 + \beta_1 \times GEBV_{\text{Trait}} + BIRTH\_YEAR + SAMPLING\_DATE + \epsilon \]
Where:
The SAMPLING_DATE variable is also converted to a factor
to account for the categorical nature of sampling date.
# ---- Per-trait regressions: cortisol ~ GEBV + birth year + sampling date ----
# Convert BIRTH_YEAR and Sampling_date to factors (categorical fixed effects)
data_final$BIRTH_YEAR <- as.factor(data_final$BIRTH_YEAR)
data_final$Sampling_date <- as.factor(data_final$Sampling_date)

# Columns 5..ncol(data_final) hold the trait GEBVs
# (1 = ID, 2 = T4Cortisol, 3 = BIRTH_YEAR, 4 = Sampling_date).
# FIX: the original comment said "columns 3 to ncol" while the loop started at
# 5, and results_list[[i - 2]] left NULL slots 1-2; the list is now
# preallocated and indexed densely.
trait_cols <- 5:ncol(data_final)
results_list <- vector("list", length(trait_cols))

for (k in seq_along(trait_cols)) {
  i <- trait_cols[k]
  trait_name <- colnames(data_final)[i]

  # Fit cortisol on the trait GEBV with birth year and sampling date as
  # additional predictors. Building the formula from column names (rather than
  # data_final[[i]]) keeps the coefficient table readable; backticks protect
  # non-syntactic trait names.
  fml <- reformulate(c(sprintf("`%s`", trait_name), "BIRTH_YEAR", "Sampling_date"),
                     response = "T4Cortisol")
  model <- lm(fml, data = data_final)
  model_summary <- summary(model)

  # Whole-model statistics
  multiple_r_squared <- model_summary$r.squared
  adjusted_r_squared <- model_summary$adj.r.squared
  f_statistic <- model_summary$fstatistic[1]  # F-statistic value
  f_num_df <- model_summary$fstatistic[2]     # numerator degrees of freedom
  f_den_df <- model_summary$fstatistic[3]     # denominator degrees of freedom
  p_value <- pf(f_statistic, f_num_df, f_den_df, lower.tail = FALSE)

  # Trait-specific coefficient: the trait is the first predictor, i.e. row 2
  # of the coefficient table (after the intercept)
  coef_summary <- coef(model_summary)
  trait_coef <- coef_summary[2, "Estimate"]
  trait_p_value <- coef_summary[2, "Pr(>|t|)"]

  results_list[[k]] <- data.frame(
    Trait = trait_name,
    Multiple_R_Squared = multiple_r_squared,
    Adjusted_R_Squared = adjusted_r_squared,
    F_Statistic = f_statistic,
    P_Value = p_value,
    Coefficient = trait_coef,
    Coefficient_P_Value = trait_p_value
  )
}

# Combine all per-trait rows into one data frame and save
results_df <- do.call(rbind, results_list)
write.csv(results_df,
          file = "/home/bambrozi/2_CORTISOL/Correlation/Results/add_BY_SAMP/regression_summary_all_traits_BY_SampDt.csv",
          row.names = FALSE)
Summary statistics for all traits' GEBVs, adding BIRTH_YEAR and SAMPLING_DATE
| Trait | Multiple_R_Squared | Adjusted_R_Squared | F_Statistic | P_Value | Coefficient | Coefficient_P_Value |
|---|---|---|---|---|---|---|
| CO | 0.3913617 | 0.3257757 | 5.967151 | 0 | -11.1849879 | 0.0099861 |
| BMR | 0.3899307 | 0.3241904 | 5.931386 | 0 | 14.4612444 | 0.0135665 |
| LP | 0.3858330 | 0.3196512 | 5.829896 | 0 | 12.2619950 | 0.0330355 |
| MILK | 0.3850052 | 0.3187343 | 5.809559 | 0 | 0.0671160 | 0.0396652 |
| PROT | 0.3841834 | 0.3178238 | 5.789422 | 0 | 2.2166395 | 0.0476301 |
| UT | 0.3822859 | 0.3157218 | 5.743130 | 0 | -12.1238483 | 0.0731558 |
| CK | 0.3801906 | 0.3134008 | 5.692345 | 0 | 12.7333709 | 0.1192726 |
| HHE | 0.3795579 | 0.3126999 | 5.677076 | 0 | 7.4003505 | 0.1388525 |
| MSP | 0.3793006 | 0.3124149 | 5.670876 | 0 | -7.2388952 | 0.1478152 |
| DA | 0.3791391 | 0.3122360 | 5.666989 | 0 | 10.8037510 | 0.1537699 |
| TU | 0.3785722 | 0.3116080 | 5.653351 | 0 | -5.2064639 | 0.1769405 |
| MSL | 0.3777098 | 0.3106526 | 5.632656 | 0 | -8.6230634 | 0.2203509 |
| BQ | 0.3775224 | 0.3104450 | 5.628166 | 0 | -7.7284891 | 0.2313766 |
| IH | 0.3772456 | 0.3101385 | 5.621542 | 0 | -4.7354916 | 0.2488963 |
| ST | 0.3767980 | 0.3096426 | 5.610839 | 0 | -9.9721644 | 0.2808121 |
| LOC | 0.3766890 | 0.3095218 | 5.608233 | 0 | -7.2367609 | 0.2893509 |
| FAT | 0.3765584 | 0.3093772 | 5.605116 | 0 | 0.8520581 | 0.3000062 |
| UD | 0.3763515 | 0.3091480 | 5.600177 | 0 | -9.3297651 | 0.3179533 |
| CA | 0.3757417 | 0.3084725 | 5.585641 | 0 | 6.0756898 | 0.3798799 |
| MS | 0.3755038 | 0.3082090 | 5.579979 | 0 | -6.0375749 | 0.4085912 |
| SCK | 0.3754363 | 0.3081342 | 5.578372 | 0 | -4.4161473 | 0.4173165 |
| SCS | 0.3754033 | 0.3080976 | 5.577587 | 0 | 3.9080681 | 0.4216769 |
| IDD | 0.3752677 | 0.3079474 | 5.574362 | 0 | 3.7029601 | 0.4403519 |
| FOOT | 0.3751830 | 0.3078536 | 5.572349 | 0 | 17.3309360 | 0.4526548 |
| TL | 0.3750803 | 0.3077398 | 5.569908 | 0 | -6.6585204 | 0.4683140 |
| MET | 0.3749055 | 0.3075462 | 5.565755 | 0 | -3.4014191 | 0.4970656 |
| CTFS | 0.3747600 | 0.3073850 | 5.562300 | 0 | 4.0557716 | 0.5233387 |
| CONF | 0.3746137 | 0.3072229 | 5.558828 | 0 | -4.7896643 | 0.5523352 |
| FE | 0.3745078 | 0.3071056 | 5.556316 | 0 | -2.9375586 | 0.5752627 |
| HL | 0.3743988 | 0.3069849 | 5.553731 | 0 | 3.4274416 | 0.6009102 |
| FA | 0.3742870 | 0.3068610 | 5.551080 | 0 | -3.2821641 | 0.6298652 |
| DD | 0.3742836 | 0.3068573 | 5.551001 | 0 | 2.5402141 | 0.6307730 |
| MOB | 0.3742556 | 0.3068263 | 5.550337 | 0 | 3.0592146 | 0.6385442 |
| FTP | 0.3742359 | 0.3068045 | 5.549870 | 0 | 4.8583743 | 0.6441419 |
| BCS | 0.3741681 | 0.3067293 | 5.548263 | 0 | 2.6692352 | 0.6643615 |
| HH | 0.3740753 | 0.3066266 | 5.546066 | 0 | 2.0230249 | 0.6947785 |
| DF | 0.3740392 | 0.3065865 | 5.545209 | 0 | 2.3681806 | 0.7076996 |
| FL | 0.3739824 | 0.3065237 | 5.543865 | 0 | -2.1711035 | 0.7294613 |
| UF | 0.3739605 | 0.3064994 | 5.543347 | 0 | 2.8880683 | 0.7384413 |
| MDR | 0.3738855 | 0.3064163 | 5.541571 | 0 | 1.8495373 | 0.7722558 |
| WL | 0.3738548 | 0.3063822 | 5.540843 | 0 | 1.2643028 | 0.7878801 |
| AFS | 0.3738293 | 0.3063540 | 5.540239 | 0 | 1.5814380 | 0.8018670 |
| RUM | 0.3738023 | 0.3063241 | 5.539601 | 0 | -1.2574561 | 0.8179113 |
| SH | 0.3738015 | 0.3063233 | 5.539583 | 0 | 1.0342843 | 0.8184040 |
| BD | 0.3736958 | 0.3062061 | 5.537080 | 0 | 0.7613199 | 0.9071017 |
| MR | 0.3736957 | 0.3062060 | 5.537078 | 0 | 0.6291752 | 0.9071898 |
| SU | 0.3736871 | 0.3061965 | 5.536876 | 0 | 0.4745538 | 0.9186634 |
| FEED | 0.3736818 | 0.3061907 | 5.536751 | 0 | 0.3944746 | 0.9266753 |
| DHL | 0.3736774 | 0.3061858 | 5.536647 | 0 | -0.5919776 | 0.9340753 |
| DO | 0.3736757 | 0.3061838 | 5.536604 | 0 | 0.5220723 | 0.9373261 |
| DS | 0.3736695 | 0.3061769 | 5.536458 | 0 | 0.3979132 | 0.9502668 |
| CW | 0.3736676 | 0.3061749 | 5.536414 | 0 | 0.3562368 | 0.9547808 |
| ME | 0.3736658 | 0.3061729 | 5.536370 | 0 | -0.2401976 | 0.9599012 |
| MT | 0.3736595 | 0.3061659 | 5.536223 | 0 | -0.0976386 | 0.9882642 |
| DCA | 0.3736593 | 0.3061657 | 5.536217 | 0 | 0.0802555 | 0.9906432 |
Lucas Alcântara sent me the path to the genotype and pedigree files: /data/cgil/daiclu/6_Data/database/public_output/bruno
In this folder we found the following files:
I made a copy of this files in a folder called Raw_files:
/home/bambrozi/2_CORTISOL/RawFiles
This directory has two sub-directories. To perform this I used the two pieces of code below.
sed -i '1i id Call' genotypes.txt
# Convert the 0/1/2/5-coded genotypes to a PLINK .ped file.
# Each input line: "<animal_id> <genotype_string>", one character per SNP.
filename = 'bruno_gntps.txt'

# Dosage -> allele-pair recoding; 5 means missing and becomes "0 0"
recode = {'0': ['C', 'C'], '1': ['A', 'C'], '2': ['A', 'A'], '5': ['0', '0']}

# BUG FIX: the original never closed the output file, so the final buffered
# lines could be lost; `with` closes both files deterministically.
with open(filename, 'r') as input_file, open('genoplink.ped', 'w') as outputFileOpen:
    for line in input_file:
        if 'Call' in line:
            continue  # skip the "id Call" header line added earlier with sed
        ids, call = line.strip().split()
        genotypes = [recode[geno012][0] + ' ' + recode[geno012][1] for geno012 in call]
        # PLINK .ped columns: FID IID PAT MAT SEX PHENO genotypes...
        outputFileOpen.write("%s %s %s %s %s %s %s\n" % ('HO', ids, '0', '0', '0', '-9', ' '.join(genotypes)))
Now I have a folder called /home/bambrozi/2_CORTISOL/Geno_files with the file named genoplink.ped
library(data.table)

pheno <- read.csv("/home/bambrozi/2_CORTISOL/Data/data_clean.csv")
ped <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/Pedigree/bruno_ids.csv")
geno <- fread("/home/bambrozi/2_CORTISOL/Geno_files/genoplink.ped")
geno <- geno[, c("V2")]  # V2 = animal id column of the .ped file

# Bring cdn_id into the phenotype file
# Generate an index with the match
matching_indices <- match(pheno$ID, ped$elora_id)
# Then assign 'cdn_id' from 'ped' to 'pheno' where there are matches
pheno$cdn_id <- ifelse(!is.na(matching_indices), ped$cdn_id[matching_indices], NA)

# Keep only the phenotyped animals that are also genotyped
pheno_genotyped <- pheno[pheno$cdn_id %in% geno$V2, ]
# Sanity check: every remaining animal must be genotyped
checkk <- pheno_genotyped$cdn_id %in% geno$V2
sum(checkk)

# The GWAS phenotype file needs three columns: FID, animal id, phenotype.
# FIX: size the FID column from the data instead of the hard-coded 252, so the
# script keeps working if the number of genotyped animals changes. Building a
# data.frame directly also avoids the cbind() coercion of everything to
# character that the original then had to undo with as.numeric().
pheno_gwas <- data.frame(
  FID = rep("HO", nrow(pheno_genotyped)),
  cdn_id = pheno_genotyped$cdn_id,
  cortisol = pheno_genotyped$T4Cortisol
)
pheno_gwas$cdn_id <- as.numeric(pheno_gwas$cdn_id)
pheno_gwas$cortisol <- round(as.numeric(pheno_gwas$cortisol), 2)
write.table(pheno_gwas, "/home/bambrozi/2_CORTISOL/GWAS/pheno_genotyped.txt",
            quote = FALSE, row.names = FALSE, col.names = TRUE)
Adjusting the SNP_map to .map
# Build the PLINK .map file (chr, snp name, genetic distance, position)
map <- fread("/data/cgil/daiclu/6_Data/database/public_output/bruno/DGVsnpinfo.2404.ho")
# FIX: size the zero genetic-distance (morgan) column from the map itself
# instead of the hard-coded 45101 SNPs
morgan <- data.frame(X0 = rep(0, nrow(map)))
mapa <- as.data.frame(cbind(map$chr, map$snp_name, morgan$X0, map$location))
head(mapa)
write.table(x = mapa, file = "/home/bambrozi/2_CORTISOL/Geno_files/genoplink.map",
            row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)
# Build the PLINK binary fileset (.bed/.bim/.fam) from the .ped/.map pair
system("/home/local/bin/plink --cow --nonfounders --allow-no-sex --keep-allele-order --file /home/bambrozi/2_CORTISOL/Geno_files/genoplink --make-bed --out /home/bambrozi/2_CORTISOL/Geno_files/genoplink")
With the code above I generated the bfiles:
We ran the code below to perform the QC.
#!/bin/bash
# Genotype quality control with PLINK. Filters (see PLINK docs):
#   --hwe 1e-5 : drop SNPs failing Hardy-Weinberg equilibrium at p < 1e-5
#   --maf 0.01 : drop SNPs with minor allele frequency below 1%
#   --geno 0.1 : drop SNPs missing in more than 10% of animals
#   --mind 0.1 : drop animals with more than 10% missing genotypes
DATA=/home/bambrozi/2_CORTISOL/Geno_files/genoplink
RESULT=/home/bambrozi/2_CORTISOL/Geno_files_after_QC/genoplink_clean
/home/local/bin/plink \
--bfile ${DATA} \
--cow \
--allow-no-sex \
--hwe 1e-5 \
--maf 0.01 \
--geno 0.1 \
--mind 0.1 \
--keep-allele-order \
--make-bed \
--out ${RESULT}
The server screen outcome is shown below.
After the Quality Control we ended up with
To check for duplicated individuals I performed the KINSHIP analysis using one script from Larissa Braga. Running the King Analysis on Plink. okok
#!/bin/bash
# Pairwise KING kinship table with plink2 -- used below to detect duplicated
# individuals (kinship >= 0.354 between different ids).
DATA=/home/bambrozi/2_CORTISOL/Geno_files_after_QC/genoplink_clean
RESULT=/home/bambrozi/2_CORTISOL/Geno_files_after_KING/result_king
plink2 \
--bfile ${DATA} \
--cow \
--make-king-table \
--out ${RESULT}
This is the output screen on terminal:
The table below is the output /home/bambrozi/2_CORTISOL/Geno_files_after_KING/result_king.kin0 and have pairwise comparisons between all individuals.
Now we should open the table in R and check for pairs of individuals with kinship above 0.354. To do this we can use the code below, also provided by Larissa Braga:
# ---- Check for duplicated individuals in the KING kinship table ----
# NOTE(review): setwd() in a script is discouraged; kept to preserve the
# original workflow.
setwd("/home/bambrozi/2_CORTISOL/Geno_files_after_KING")
relatedness <- "result_king.kin0"  ## change accordingly!!
library(data.table)
print(relatedness)
# FIX: spell out header = TRUE; the original `h = T` relied on partial
# argument matching and on the reassignable constant T
rel <- fread(relatedness, header = TRUE)
head(rel)
print("Individuals with different identifications above the cut off of 0.354:")
# Pairs of DIFFERENT ids whose kinship reaches the duplicate cutoff of 0.354
dup <- subset(rel, KINSHIP >= 0.354 & IID1 != IID2)
print(dup)
nrow(dup)
So the code above will provide this output if there is no duplicated individual.
We do not have any duplicated individual
So the file to be used are those in the directory /home/bambrozi/2_CORTISOL/Geno_files_after_QC
files:genoplink_clean
After Quality Control we didn't lose any animals, so we don't need to update our phenotype file.
Now before performing the PCA analysis we need to change the FID for those individuals that has phenotype = 1 for Nadia.
#!/bin/bash
# Principal component analysis of the QC'd genotypes with PLINK (--pca).
# Outputs pca_result.eigenval and pca_result.eigenvec, plotted below in R.
DATA=/home/bambrozi/2_CORTISOL/Geno_files_after_QC/genoplink_clean
RESULT=/home/bambrozi/2_CORTISOL/PCA/pca_result
plink \
--bfile ${DATA} \
--keep-allele-order \
--cow \
--pca \
--out ${RESULT}
The PCA output:
After generate the Eigenvalues and Eigenvectors I used the code below to generate the PCA Plot
# ---- Visualize the PLINK PCA results ----
setwd("/home/bambrozi/2_CORTISOL/PCA")
library(ggplot2)
library(tidyverse)

# Read in the result files (space separated, no column names)
eigenValues <- read_delim("pca_result.eigenval", delim = " ", col_names = FALSE)
eigenVectors <- read_delim("pca_result.eigenvec", delim = " ", col_names = FALSE)

## Proportion of variation captured by each vector
eigen_percent <- round((eigenValues / (sum(eigenValues)) * 100), 2)

# PCA plot (X3/X4 = first two principal components)
png("pca-plink.eng.png", width = 5, height = 5, units = "in", res = 300)
# FIX: explicit print() -- a bare top-level ggplot object is auto-printed at
# the console but NOT when the script is run via source(), which would leave
# the png device empty
print(
  ggplot(data = eigenVectors) +
    geom_point(mapping = aes(x = X3, y = X4), color = "red", shape = 19, size = 1, alpha = 1) +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_vline(xintercept = 0, linetype = "dotted") +
    labs(x = paste0("Principal component 1 (", eigen_percent[1, 1], " %)"),
         y = paste0("Principal component 2 (", eigen_percent[2, 1], " %)")) +
    theme_minimal()
)
dev.off()

# PCA plot with animal ids (X2 = individual id, used as the point label)
png("pca-plink.eng.animals_id.png", width = 50, height = 50, units = "in", res = 300)
print(
  ggplot(data = eigenVectors) +
    geom_point(mapping = aes(x = X3, y = X4), color = "red", shape = 19, size = 5, alpha = 1) +
    geom_text(mapping = aes(x = X3, y = X4, label = X2), size = 2, hjust = 0, vjust = 0) +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_vline(xintercept = 0, linetype = "dotted") +
    labs(x = paste0("Principal component 1 (", eigen_percent[1, 1], " %)"),
         y = paste0("Principal component 2 (", eigen_percent[2, 1], " %)")) +
    theme_minimal()
)
dev.off()
I received an e-mail from Umesh describing the three categories into which the animals could be sorted based on their cortisol concentration.
# ---- Build phenotype / fixed-effect / keep-list files for the extreme-phenotype GWAS ----
# FIX throughout: TRUE/FALSE instead of the reassignable shorthands T/F.
pheno <- read.table("/home/bambrozi/2_CORTISOL/GWAS/pheno_genotyped.txt", header = TRUE)
data_final <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/data_GEBVs_Cortisol_select_traits2.csv", header = TRUE)
ids_eq <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/Pedigree/bruno_ids.csv", header = TRUE)

# Fixed effects only for the animals that have both phenotype and genotype
data_final$cdn_id <- ids_eq$cdn_id[match(data_final$ID, ids_eq$elora_id)]
fixeff <- data_final[, c("ID", "BIRTH_YEAR", "Sampling_date", "cdn_id")]
fixeff <- fixeff[fixeff$cdn_id %in% pheno$cdn_id, ]
fixeff$FID <- "HO"
fixeff <- fixeff[, c("FID", "cdn_id", "BIRTH_YEAR", "Sampling_date")]

# Remove the intermediate (Medium) animals from pheno, keeping only the
# extremes (same cutoffs as before: H >= 956, L <= 190.8)
pheno$Categorical <- ifelse(pheno$cortisol >= 956, "H",
                            ifelse(pheno$cortisol <= 190.8, "L", "M"))
table(pheno$Categorical)
pheno <- pheno[pheno$Categorical != "M", ]
pheno <- pheno[, c("FID", "cdn_id", "cortisol")]

# Drop from fixeff the animals no longer present in pheno
fixeff <- fixeff[fixeff$cdn_id %in% pheno$cdn_id, ]
# Check that the animals and their order match between the two files.
# NOTE(review): this only prints the result; consider stopifnot() so a
# mismatch stops the script instead of going unnoticed.
identical(fixeff$cdn_id, pheno$cdn_id)

# Keep-list of the extreme animals for PLINK (FID + id, no header)
to_keep_geno <- pheno[, c("FID", "cdn_id")]
write.table(fixeff, "/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/fixeff.txt", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.table(pheno, "/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/pheno.txt", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.table(to_keep_geno, "/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/to_keep_geno.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
We ended up with
H (High) = 34 animals
L (Low) = 37 animals; Total = 71 animals
On Plink we will remove all individuals from genotype files that are classified as Medium, keeping only the High and Low
#!/bin/bash
# Keep only the High/Low animals listed in to_keep_geno.txt (i.e. remove the
# Medium category) and rebuild the binary fileset.
DATA=/home/bambrozi/2_CORTISOL/Geno_files_after_QC/genoplink_clean
RESULT=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/geno_extreme
KEEP=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/to_keep_geno.txt
plink2 --bfile ${DATA} --keep ${KEEP} --chr-set 29 --make-bed --out ${RESULT}
#!/bin/bash
# Mixed-linear-model GWAS (leave-one-chromosome-out) with GCTA --mlma-loco on
# the extreme (High/Low) animals only.
# NOTE(review): BIRTH_YEAR and Sampling_date are categorical, but they are
# passed with --qcovar (quantitative covariates); GCTA's option for discrete
# covariates is --covar -- confirm this is intended.
DATA=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/geno_extreme
RESULT=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/GWAS_result
PHENO=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/pheno.txt
FIXEFF=/home/bambrozi/2_CORTISOL/GWAS/EXTREM_PHENO_BY_SD/fixeff.txt
/home/local/bin/gcta \
--bfile ${DATA} \
--mlma-loco \
--pheno ${PHENO} \
--qcovar ${FIXEFF} \
--autosome-num 29 \
--autosome \
--out ${RESULT}
After running the GWAS above, I got the following message from GCTA:
Error: analysis stopped because more than half of the variance components are constrained. The result would be unreliable. You may have a try of the option –reml-no-constrain.
Because of this error message we needed another approach. We used the whole sample (252 individuals) to estimate the variance components, and then, using these variance components from the whole sample, performed the ssGWAS with the subset of individuals (34 High + 37 Low = 71). This was not possible in GCTA, so we switched to another software (BLUPF90+).
To run ssGWAS on Blupf90+ suite, we will need 4 different softwares:
The tutorial for preGSF90 and postGSF90 we can find in the link bellow https://nce.ads.uga.edu/wiki/doku.php?id=readme.pregsf90#gwas_options_postgsf90
According to the BLUPF90+ tutorial:
ssGWAS is an iterative procedure with 2 steps:
0) Quality control
1) prediction of GEBV with ssGBLUP
2) prediction of SNP marker effects based on the GEBV
Preparing files to run Variance components estimation using REML with AI (Average Information) algorithm.
First you need to create a directory in your home directory, prepare and save the following files in:
The appearance of this file is like this:
FIRST COLUMN = Animal ID
SECOND COLUMN = Phenotype
THIRD COLUMN = Fixed Effect 1
FOURTH COLUMN = Fixed Effect 2
First we are going to generate a Phenotype_Fixed_Effect file with the whole sample (252 individuals) that we are going to use for the Variance Components Estimation.
To get in one file these four columns we need the following code:
okok
# ---- Build the BLUPF90 phenotype + fixed-effects file (all 252 animals) ----
# FIX throughout: TRUE/FALSE instead of the reassignable shorthands T/F.
pheno <- read.table("/home/bambrozi/2_CORTISOL/GWAS/pheno_genotyped.txt", header = TRUE)
data_final <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/GEBVs_Elora/data_GEBVs_Cortisol_select_traits2.csv", header = TRUE)
ids_eq <- read.csv("/home/bambrozi/2_CORTISOL/RawFiles/Pedigree/bruno_ids.csv", header = TRUE)

# Create data_final$cdn_id by matching IDs with elora_id
data_final$cdn_id <- ids_eq$cdn_id[match(data_final$ID, ids_eq$elora_id)]
fixeff <- data_final[, c("ID", "BIRTH_YEAR", "Sampling_date", "cdn_id")]
fixeff <- fixeff[fixeff$cdn_id %in% pheno$cdn_id, ]
fixeff$FID <- "HO"
fixeff <- fixeff[, c("FID", "cdn_id", "BIRTH_YEAR", "Sampling_date")]
# Sanity check: same animals, same order (result is printed, not enforced)
identical(fixeff$cdn_id, pheno$cdn_id)

fixeff <- fixeff[, c("cdn_id", "BIRTH_YEAR", "Sampling_date")]
pheno <- pheno[, c("cdn_id", "cortisol")]

# Load necessary libraries
library(dplyr)
# Join phenotype and fixed effects on cdn_id
merged_data <- pheno %>%
  left_join(fixeff, by = "cdn_id")
# Translate cdn_id to the iid used by the BLUPF90 files
merged_data$iid <- ids_eq$iid[match(merged_data$cdn_id, ids_eq$cdn_id)]
merged_data <- merged_data[, c("iid", "cortisol", "BIRTH_YEAR", "Sampling_date")]
# BLUPF90 data file: space separated, no header
write.table(merged_data, "/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/fenofix.txt", col.names = FALSE, quote = FALSE, row.names = FALSE)
The file should be saved as text file, with separation by space and no columns names.
PS: if there are any NA values, they should be replaced by 9999.
/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/fenofix.txt (this file has 252 individuals)
The appearance of this file is like this:
FIRST COLUMN = Animal ID
SECOND COLUMN = Sire ID
THIRD COLUMN = Dam ID
The file should be saved as text file, with separation by space and no columns names.
We used the code below to convert the comma-separated .csv file into a space-separated file.
/home/bambrozi/2_CORTISOL/RawFiles/Pedigree okok
# to replace comma for space in the .csv file with the equivalence among IDs
# /home/bambrozi/2_CORTISOL/RawFiles/Pedigree
sed 's/,/ /g' bruno_ped.iid.csv > bruno_ped_iid_blup.txt
First we are going to generate a Genotype file with the whole sample (252 individuals) that we are going to use for the Variance Components Estimation.
The appearance of this file is like this:
FIRST COLUMN = Animal ID SECOND COLUMN = Genotypes (0, 1 and 2 format)
The file should be saved as text file, with separation by space and no columns names.
We used the code below to replace the cdn_id with the iid. First we merge using the second column of the first file and the first column of the second file. Then we use awk again to keep only the third and fifth columns and save the result in a different file.
/home/bambrozi/2_CORTISOL/RawFiles/Genotypes
# Using the awk function to merge the two files and the second awk to select only the 3rd and 5fh columns
awk 'FNR==NR {a[$2]=$0; next} {print a[$1], $0}' bruno_ids.csv bruno_gntps.txt | awk '{print$3,$5}' > bruno_gntps_iid
Below we can find the file’s location from the code above: /home/bambrozi/2_CORTISOL/RawFiles/Genotypes/bruno_gntps.txt /home/bambrozi/2_CORTISOL/RawFiles/Pedigree/bruno_ids.csv /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/bruno_gntps_iid (this file has 252 individuals)
/home/bambrozi/2_CORTISOL/RawFiles/SNP_map okok
# Build the BLUPF90 SNP map (CHR POS SNP_ID) from the DGV SNP info file.
# FIX: TRUE/FALSE instead of the reassignable shorthands T/F.
mapfile <- read.table("/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/DGVsnpinfo.2404.ho", header = TRUE)
colnames(mapfile)
# Rename the three columns, then reorder them to CHR, POS, SNP_ID
colnames(mapfile) <- c("SNP_ID", "CHR", "POS")
mapfile <- mapfile[, c("CHR", "POS", "SNP_ID")]
write.table(mapfile, "/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/snpmap.txt", col.names = TRUE, row.names = FALSE, quote = FALSE)
But, before running the GEBV we will first perform one additional step to “CLEAN” our genotypes. Actually BLUPF90 by default perform a data cleaning with pre set parameters, but we will change some default parameters an so perform this additional step.
The Parameter card for this step is the parameter bellow:
/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/renum_QC.par
okok
DATAFILE
fenofix.txt
TRAITS
2
FIELDS_PASSED TO OUTPUT
WEIGHT(S)
RESIDUAL_VARIANCE
1.0
EFFECT
3 cross numer
EFFECT
4 cross alpha
EFFECT
1 cross alpha
RANDOM
animal
FILE
bruno_ped_iid_blup.txt
FILE_POS
1 2 3 0 0
SNP_FILE
bruno_gntps_iid
PED_DEPTH
0
(CO)VARIANCES
1.0
OPTION outcallrate
OPTION saveCleanSNPs
OPTION minfreq 0.01
OPTION map_file snpmap.txt
OPTION excludeCHR 30 31
To run any software from the BLUPF90 suite we will always proceed in this way:
ssh grand
cd /home/bambrozi/2_CORTISOL/GWAS/BLUPF90
chmod +x renumf90
chmod +x blupf90+
./renumf90
When you run the code above, it will ask you the name of your parameter card, for this step is renum_QC.par.
The command above will generate couple files, among them renf90.par
We modified renf90.par in:The parameter card to perform the Quality Control is: /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/renf90_DataClean.par
It will run using the software pre preGS90 to generate the Clean Genotype and SNP_MAP files after Quality Control. okok
# BLUPF90 parameter file created by RENUMF90
DATAFILE
renf90.dat
NUMBER_OF_TRAITS
1
NUMBER_OF_EFFECTS
3
OBSERVATION(S)
1
WEIGHT(S)
EFFECTS: POSITIONS_IN_DATAFILE NUMBER_OF_LEVELS TYPE_OF_EFFECT[EFFECT NESTED]
2 4 cross
3 23 cross
4 3724 cross
RANDOM_RESIDUAL VALUES
1.0000
RANDOM_GROUP
3
RANDOM_TYPE
add_an_upginb
FILE
renadd03.ped
(CO)VARIANCES
1.0000
OPTION SNP_file bruno_gntps_iid
OPTION outcallrate
OPTION saveCleanSNPs
OPTION minfreq 0.01
OPTION map_file snpmap.txt
OPTION excludeCHR 30 31
The second parameter card used for Variance Components Estimation (VCE) is the following: /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/renf90_VarCompEst.par
It will be run using the software blupf90+ to generate the VCE.
# BLUPF90 parameter file created by RENUMF90
DATAFILE
renf90.dat
NUMBER_OF_TRAITS
1
NUMBER_OF_EFFECTS
3
OBSERVATION(S)
1
WEIGHT(S)
EFFECTS: POSITIONS_IN_DATAFILE NUMBER_OF_LEVELS TYPE_OF_EFFECT[EFFECT NESTED]
2 4 cross
3 23 cross
4 3724 cross
RANDOM_RESIDUAL VALUES
1.0000
RANDOM_GROUP
3
RANDOM_TYPE
add_an_upginb
FILE
renadd03.ped
(CO)VARIANCES
1.0000
OPTION SNP_file bruno_gntps_iid_clean
OPTION no_quality_control
OPTION method VCE
OPTION origID
OPTION missing 9999
OPTION se_covar_function H2_1 g_3_3_1_1/(g_3_3_1_1+r_1_1)
In the parameter card above, we remove the option for Quality Control and added options for Variance Components Estimation, for Missing data, for origID and for heritability estimation, but the MOST IMPORTANT PART is we need to change the OPTION SNP_FILE, replacing the original genotype file, for the “clean” version generated in the previous step.
The Variance Components will be placed in the file: /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/blupf90.log
blupf90.log
Now you should update your renf90_VarCompEst.par file with the following information from the .log file:
Copy the residual variance from blupf90.log and paste it into renf90_VarCompEst.par under RANDOM_RESIDUAL VALUES. Copy the genetic variance for effect x from blupf90.log and paste it into renf90_VarCompEst.par under (CO)VARIANCES.
run blupf90+ again with the updated renf90_VarCompEst.par
If the residual variance and the genetic variance for effect x did not change in your blupf90.log, the analysis has converged; if the values changed, update renf90_VarCompEst.par again and rerun blupf90+ until the values no longer change.
blupf90.log
Parameter card
Now we’ll run a new analysis using the Variance Components Estimation from the previous step to perform the GWAS.
To do this, we first need a genotype file with only the 71 animals (High = 34 and Low = 37).
In this read_me file we performed a ssGWAS but keeping the phenotype and fixed effect information for all 252 individuals, only reducing the number of individuals for the Genotype file.
I used the command line below to remove the individuals with MEDIUM phenotypes. I ran this code in /home/bambrozi/2_CORTISOL/GWAS/BLUPF90
awk 'NR==FNR{ids[$1]; next} $1 in ids' fenofix.txt bruno_gntps_iid > bruno_gntps_iid_71
I copied the output to /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno
Run with renumf90
/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno/renum_QC.par
okok
DATAFILE
fenofix.txt
TRAITS
2
FIELDS_PASSED TO OUTPUT
WEIGHT(S)
RESIDUAL_VARIANCE
77182
EFFECT
3 cross numer
EFFECT
4 cross alpha
EFFECT
1 cross alpha
RANDOM
animal
FILE
bruno_ped_iid_blup.txt
FILE_POS
1 2 3 0 0
SNP_FILE
bruno_gntps_iid_71
PED_DEPTH
0
(CO)VARIANCES
28212
OPTION outcallrate
OPTION saveCleanSNPs
OPTION minfreq 0.01
OPTION map_file snpmap.txt
OPTION excludeCHR 30 31
We modified renf90.par in 3 copies:
To run the parameter card below we are going to use the software preGSf90: /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno/renf90_DataClean.par
# BLUPF90 parameter file created by RENUMF90
DATAFILE
renf90.dat
NUMBER_OF_TRAITS
1
NUMBER_OF_EFFECTS
3
OBSERVATION(S)
1
WEIGHT(S)
EFFECTS: POSITIONS_IN_DATAFILE NUMBER_OF_LEVELS TYPE_OF_EFFECT[EFFECT NESTED]
2 4 cross
3 22 cross
4 3724 cross
RANDOM_RESIDUAL VALUES
77182.
RANDOM_GROUP
3
RANDOM_TYPE
add_an_upginb
FILE
renadd03.ped
(CO)VARIANCES
28212.
OPTION SNP_file bruno_gntps_iid_71
OPTION outcallrate
OPTION saveCleanSNPs
OPTION minfreq 0.01
OPTION map_file snpmap.txt
OPTION excludeCHR 30 31
It may be necessary to run the command below on the server. Setting the stack size to “unlimited” allows the program to allocate memory for these large structures without hitting stack limits. By removing the stack size limit, BLUPF90 is less likely to encounter segmentation faults or memory-allocation failures when the stack space is insufficient for the computations.
ulimit -s unlimited
We are going to run the parameter card below using the software blupf90+:
/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno/renf90_ssGWAS1_Ginv.par
# BLUPF90 parameter file created by RENUMF90
DATAFILE
renf90.dat
NUMBER_OF_TRAITS
1
NUMBER_OF_EFFECTS
3
OBSERVATION(S)
1
WEIGHT(S)
EFFECTS: POSITIONS_IN_DATAFILE NUMBER_OF_LEVELS TYPE_OF_EFFECT[EFFECT NESTED]
2 4 cross
3 22 cross
4 3724 cross
RANDOM_RESIDUAL VALUES
77182.
RANDOM_GROUP
3
RANDOM_TYPE
add_an_upginb
FILE
renadd03.ped
(CO)VARIANCES
28212.
OPTION SNP_file bruno_gntps_iid_71_clean
OPTION no_quality_control
OPTION origID
OPTION missing 9999
OPTION saveGInverse
OPTION saveA22Inverse
OPTION snp_p_value
We are going to run the parameter card below using the software postGSf90: /home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno/renf90_ssGWAS2_SNPeff.par
# BLUPF90 parameter file created by RENUMF90
DATAFILE
renf90.dat
NUMBER_OF_TRAITS
1
NUMBER_OF_EFFECTS
3
OBSERVATION(S)
1
WEIGHT(S)
EFFECTS: POSITIONS_IN_DATAFILE NUMBER_OF_LEVELS TYPE_OF_EFFECT[EFFECT NESTED]
2 4 cross
3 22 cross
4 3724 cross
RANDOM_RESIDUAL VALUES
77182.
RANDOM_GROUP
3
RANDOM_TYPE
add_an_upginb
FILE
renadd03.ped
(CO)VARIANCES
28212.
OPTION SNP_file bruno_gntps_iid_71_clean
OPTION origID
OPTION no_quality_control
OPTION readGInverse
OPTION readA22Inverse
OPTION map_file snpmap.txt_clean
OPTION snp_p_value
OPTION Manhattan_plot_R
OPTION Manhattan_plot
This code will generate several files, among them:
To make the Manhattan plot considering genome independent segments we should run the code below. Part of this code comes from the file Pft1e3.R.
# Compute a Bonferroni-style significance threshold based on the effective
# number of independent genome segments (Me) for the ARS-UCD1.2 bovine assembly.
# read_tsv() comes from readr — it must be loaded before the call, and dplyr
# must be loaded before the pipe below (the original loaded dplyr too late and
# never loaded readr).
library(readr)
library(dplyr)
Genome_Assembly_ARS_UCD_1_2 <- read_tsv("/home/bambrozi/2_CORTISOL/GWAS/sequence_report_ARS-UCD1_2.tsv")
# Total genome length: sum the sequence lengths of the 29 bovine autosomes.
L <- Genome_Assembly_ARS_UCD_1_2 %>%
  filter(`UCSC style name` %in% paste0("chr", 1:29)) %>%
  summarise(total_length = sum(`Seq length`)) %>%
  pull(total_length)
# Convert base pairs to Morgans (1 Mb ~ 1 cM = 0.01 Morgan, i.e. 10^8 bp per Morgan)
L_M <- L / 10^8
# Effective population size from the literature
Ne <- 66 # (Makanjoula et al., 2020)
NeL <- Ne * L_M
# Effective number of independent segments in the genome.
# NOTE(review): segment-number formulas in the literature (e.g. Goddard 2009)
# typically use the natural log; log10 is kept here to reproduce the original
# threshold — confirm this is intended.
Me <- (2 * NeL) / log10(NeL)
# Bonferroni threshold on the -log10(p) scale, correcting for Me tests
bonf <- -log10(0.05 / Me)
# Prepare postGSf90 SNP p-value output for the Manhattan plot.
setwd("/home/bambrozi/2_CORTISOL/GWAS/BLUPF90/EXTREME_PHENO/keep_all_pheno")
# Assumed column layout of chrsnp_pval: V1 = trait, V2 = effect,
# V3 = -log10(p-value), V4 = SNP order/index, V5 = chromosome
# — TODO confirm against the postGSf90 documentation.
yyy1 <- read.table("chrsnp_pval")
yyy <- yyy1[order(yyy1$V4), ]                    # sort by SNP index (plot order)
zzz <- yyy[which(yyy$V1 == 1 & yyy$V2 == 3), ]   # keep trait 1, effect 3 only
n <- nrow(zzz)
y <- zzz[, 4]      # SNP index (horizontal axis of the plot)
x <- zzz[, 3]      # -log10(p-value) (vertical axis of the plot)
chr1 <- zzz[, 5]   # chromosome of each plotted SNP
# Axis tick position for each chromosome: the SNP index of its middle row.
# NOTE(review): computed over all rows of yyy, not just trait 1 / effect 3;
# if the file holds several trait/effect combinations the ticks may shift.
chrn <- unique(yyy$V5)
pos <- vapply(chrn, function(i) {
  zz <- yyy[yyy$V5 == i, ]
  # max(1L, ...) guards a single-SNP chromosome, where round(1/2) == 0 would
  # index position 0 and silently drop the tick in the original loop.
  zz$V4[max(1L, round(nrow(zz) / 2, 0))]
}, numeric(1))
# Cycle four colors across chromosomes so neighbors are distinguishable
chr <- character(length(chr1))
chr[chr1 %% 4 == 0] <- "darkgoldenrod"
chr[chr1 %% 4 == 1] <- "darkorchid"
chr[chr1 %% 4 == 2] <- "blue"
chr[chr1 %% 4 == 3] <- "forestgreen"
# Create Manhattan plot with Bonferroni line and legend
# Render to a high-resolution PNG (600 dpi); width/height are in pixels.
png(file = "Pft1e3_manplot_with_bonf_ind_seg.png", width = 20000, height = 10000, res = 600)
par(mfrow = c(1, 1), family = "sans", cex = 1.5, font = 2)
# Note the argument order: plot(y, x) puts the SNP index (y) on the
# horizontal axis and -log10(p-value) (x) on the vertical axis.
plot(y, x, xaxt = "n", main = "Manhattan Plot SNP p_value - Trait: 1 Effect: 3", xlab = "", ylab = "-log10(p-value)", pch = 20, xlim = c(1, n), ylim = c(0, max(x)), col = chr)
# Add Bonferroni line
abline(h = bonf, col = "red", lwd = 2, lty = 2)
# Add legend for Bonferroni line
legend("topright", legend = "Bonferroni correction for genome independent segments", col = "red", lwd = 2, lty = 2, cex = 1)
# Chromosome labels placed at the per-chromosome midpoints computed above
axis(1, at = pos, labels = chrn)
# Close the graphics device to flush the PNG to disk
dev.off()